import pandas as pd

# Load the Stack Overflow question-quality train/validation CSVs.
# NOTE(review): absolute local paths — these only resolve on the author's machine.
train = pd.read_csv('/Users/ruohezhou/Documents/451project/train.csv')
valid = pd.read_csv('/Users/ruohezhou/Documents/451project/valid.csv')
print(len(train))
print(len(valid))
frames = [train, valid]
df = pd.concat(frames) #merge train and valid dataset into one dataset
print(len(df))
train.head()
# Peek at one cell of column index 2 (presumably the question Body — confirm column order).
df.iloc[0, 2]
#only run one time, otherwise run from the first chunk where df just got read and merged
# '<tag1><tag2>' -> ['tag1', 'tag2']: drop the outer angle brackets, split on '><'.
df['Tags'] = df['Tags'].apply(lambda tag_str: tag_str[1:-1].split('><'))
# Trim the leading 3 and trailing 6 characters of the body markup.
df['Body'] = df['Body'].apply(lambda body: body[3:-6])
df.head()
df['Y'].unique()

# Split into high-quality and low-quality questions, keeping only Title, Body, Y.
df_high = df[df['Y'] == 'HQ'].drop(columns=['Id', 'Tags', 'CreationDate'])
df_high.head()
df_low = df[df['Y'].isin(['LQ_CLOSE', 'LQ_EDIT'])].drop(columns=['Id', 'Tags', 'CreationDate'])
df_low.head()
import string
string.punctuation
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import copy
# Deep copies so the cleaning pass can mutate text in place without touching the originals.
df_high_copy = copy.deepcopy(df_high)
df_low_copy = copy.deepcopy(df_low)
def clean(dataset, stop_words=None, punctuation=None):
    """Strip markup noise, stop words and punctuation tokens from column 1 of *dataset*.

    Mutates *dataset* in place (column index 1 — the Body column after
    Id/Tags/CreationDate are dropped) and also returns it for chaining.

    Parameters
    ----------
    dataset : pandas.DataFrame whose column index 1 holds raw text.
    stop_words : optional set of lower-case words to drop; defaults to the
        NLTK English stop-word list.
    punctuation : optional set of punctuation tokens to drop; defaults to
        ``string.punctuation``.

    BUG FIX: the original read module-level ``stop_words``/``punctuation``
    globals that are only assigned much later in the file, so calling this
    function in top-to-bottom order raised NameError. The sets are now
    parameters with self-contained defaults.
    """
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    if punctuation is None:
        punctuation = set(string.punctuation)
    for i in range(len(dataset)):
        text = dataset.iloc[i, 1]
        # Drop (...)/[...] spans, HTML tags, newlines and {...} code blocks,
        # then any stray angle brackets left over. Raw strings avoid the
        # invalid-escape warnings the original patterns produced.
        text = re.sub(r"[\(\[].*?[\)\]]", " ", text)
        text = re.sub(r"<.*?>", " ", text)
        text = re.sub(r"\n", " ", text)
        text = re.sub(r"{.*?}", " ", text)
        text = re.sub(r">", "", text)
        text = re.sub(r"<", "", text)
        words = text.split()
        kept = [word for word in words if word.lower() not in stop_words]
        kept = [word for word in kept if word not in punctuation]
        dataset.iloc[i, 1] = ' '.join(kept)
    return dataset
def wordcloud_array(dataset):
    """Return the text of column index 1 of *dataset* as one space-separated string.

    Each row's text is whitespace-normalised (split + re-join) and a trailing
    space follows every row, matching the original output exactly.

    PERF FIX: the original accumulated with ``comment_words += ...`` inside a
    loop, which is quadratic over large frames; a single ``str.join`` builds
    the result in one linear pass.
    """
    return ''.join(
        ' '.join(dataset.iloc[idx, 1].split()) + ' '
        for idx in range(len(dataset))
    )
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Word cloud of the *cleaned* high-quality question bodies.
# NOTE(review): clean() reads module-level stop_words/punctuation sets that the
# original notebook defined in a later cell — cell run order matters here.
wordcloud = WordCloud (
background_color = 'white',
width = 800,
height = 800,
random_state = 123).generate(wordcloud_array(clean(df_high_copy)))
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud) # image show
plt.axis('off') # to off the axis of x and y
plt.show()
# Same cloud for the *uncleaned* high-quality bodies, for comparison.
wordcloud_notclean = WordCloud (
background_color = 'pink',
width = 800,
height = 800,
random_state = 123).generate(wordcloud_array(df_high))
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud_notclean) # image show
plt.axis('off') # to off the axis of x and y
plt.show()
# Cleaned low-quality bodies.
wordcloud_low = WordCloud (
background_color = 'black',
width = 800,
height = 800,
random_state = 123).generate(wordcloud_array(clean(df_low_copy)))
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud_low) # image show
plt.axis('off') # to off the axis of x and y
plt.show()
# Uncleaned low-quality bodies.
wordcloud_low_notclean = WordCloud (
colormap='RdYlGn',
width = 800,
height = 800,
random_state = 123).generate(wordcloud_array(df_low))
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud_low_notclean) # image show
plt.axis('off') # to off the axis of x and y
plt.show()
import string
string.punctuation
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Module-level stop-word / punctuation sets (also read by clean() above).
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
# NOTE(review): this loop duplicates a simpler version of clean() (tag-strip via
# replace instead of regex) and mutates df_high itself, not a copy — every later
# use of df_high sees the cleaned text.
for i in range(len(df_high)):
    string_cleaned = df_high.iloc[i, 1].replace('<p>', '').replace('</p>', '').replace('</a>', '')
    w = string_cleaned.split()
    resultwords = [word for word in w if word.lower() not in stop_words]
    resultwords1 = [word for word in resultwords if word not in punctuation]
    df_high.iloc[i, 1] = ' '.join(resultwords1)
# Concatenate all cleaned bodies into one string for the word cloud.
comment_words = ''
for idx in range(len(df_high)):
    ind = df_high.iloc[idx, 1].split()
    comment_words += " ".join(ind)+" "
from wordcloud import WordCloud
import matplotlib.pyplot as plt
wordcloud = WordCloud (
background_color = 'black',
width = 800,
height = 800
).generate(comment_words)
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud) # image show
plt.axis('off') # to off the axis of x and y
#plt.savefig('Plotly-World_Cloud.png')
plt.show()
### tags wordcloud
import numpy as np

# Flatten each question's tag list (column index 3) into a comma-joined string.
# FIX: the original ran this identical loop twice back to back; once suffices.
label = []
for k in df.iloc[:, 3]:
    label.append(','.join(k))
#label = list(np.unique(label)) #remove repetitive words

# One big string for the word cloud, each entry followed by a space.
# str.join builds it in one pass instead of the original quadratic `cloud +=`.
cloud = ''.join(entry + ' ' for entry in label)

from wordcloud import WordCloud
import matplotlib.pyplot as plt
wordcloud = WordCloud (
background_color = 'pink',
width = 800,
height = 800
).generate(cloud)
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud) # image show
plt.axis('off') # to off the axis of x and y
#plt.savefig('Plotly-World_Cloud.png')
plt.show()
import re
from itertools import chain
from nltk.tokenize import word_tokenize

# Tokenise every body (column index 2), replacing runs of non-word characters
# with spaces first so punctuation does not glue tokens together.
list2 = []
for i in range(len(df)):
    list2.append(word_tokenize(re.sub(r'\W+', ' ', df.iloc[i, 2])))
wordlist = list(chain.from_iterable(list2))

# Keep alphanumeric tokens only (drops stray punctuation-only tokens).
cleanlist = [k for k in wordlist if k.isalnum()]  #len(wordlist) = 8501886, len(cleanlist) = 8356337

# PERF FIX: the original `cloud1 += i + ' '` loop is quadratic over ~8M words;
# a single join produces the identical string in linear time.
cloud1 = ''.join(k + ' ' for k in cleanlist)

from wordcloud import WordCloud, STOPWORDS
# NOTE(review): this shadows the nltk `stopwords` module imported earlier
# (it is re-imported further down, so nothing breaks, but rename if reused).
stopwords = set(STOPWORDS)
import matplotlib.pyplot as plt
wordcloud = WordCloud(
colormap='RdYlGn',
stopwords = stopwords,
background_color = 'white',
width = 800,
height = 800
).generate(cloud1)
plt.figure(figsize=(10,10)) #adjust figure size before calling imshow
plt.imshow(wordcloud) # image show
plt.axis('off') # to off the axis of x and y
#plt.savefig('Plotly-World_Cloud.png')
plt.show()
# Model-ready frame: Title, Body and the target Y mapped to integer labels.
data = df.drop(['Id', 'Tags', 'CreationDate'], axis=1)
mat = {'LQ_CLOSE':0, 'LQ_EDIT': 1, 'HQ':2}
data['Y'] = data['Y'].map(mat)
data.head()
data['Y'].value_counts()
import matplotlib.pyplot as plt
# Class-distribution pie chart (order: HQ, LQ_CLOSE, LQ_EDIT).
labels = ['High Quality Questions', 'Low Quality Question - Close', 'Low Quality Question - Edit']
values = [len(data[data['Y'] == 2]), len(data[data['Y'] == 0]), len(data[data['Y'] == 1])]
#plt.figure(figsize=(16, 9))
plt.pie(x=values, labels=labels, autopct="%1.1f%%", colors = ['pink', 'olive', 'cyan'])
plt.title("Y Value Distribution")
plt.show()
# Character-length features for the two text columns.
data['Body_length'] = data['Body'].apply(len)
data['Title_length'] = data['Title'].apply(len)
data.head()
import seaborn as sns
# NOTE(review): FacetGrid's `size` parameter was renamed `height` in seaborn 0.9
# and later removed — confirm against the installed seaborn version.
emptygrid = sns.FacetGrid(data,col='Y',size=3)
emptygrid.map(plt.hist,'Body_length')
# NOTE(review): corr() on a frame with text columns needs numeric_only=True on pandas >= 2.0.
data.corr()
emptygrid1 = sns.FacetGrid(data,col='Y',size=3)
emptygrid1.map(plt.hist, 'Title_length')
import re
def clean_text(text):
    """Lower-case *text* and remove every character that is not a letter or whitespace.

    BUG FIX: the original pattern ``r'[^(a-zA-Z)\\s]'`` placed literal
    parentheses inside the character class (parens do not group there), so
    '(' and ')' accidentally survived cleaning; the corrected class removes
    them along with digits and punctuation.
    """
    text = text.lower()
    return re.sub(r'[^a-zA-Z\s]', '', text)
# Apply the regex cleaner to every body.
data['Body'] = data['Body'].apply(clean_text)
data.head()
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
# TF-IDF features with stop words still present (the "withstop" baseline).
X_withstop = vectorizer.fit_transform(data['Body'])
y_withstop = data['Y'].values
from sklearn.model_selection import train_test_split
# Stratified 80/20 split keeps the class balance in both halves.
X_train_withstop, X_test_withstop, y_train_withstop, y_test_withstop = \
train_test_split(X_withstop, y_withstop, test_size=0.2,
shuffle=True, random_state=123, stratify=y_withstop)
# with stopwords
from sklearn.ensemble import RandomForestClassifier
forest1 = RandomForestClassifier(n_estimators=100,
random_state=123,
max_depth = 50)
forest1.fit(X_train_withstop, y_train_withstop)
print(f"Train Accuracy: {forest1.score(X_train_withstop, y_train_withstop)*100:0.3f}%")
print(f"Test Accuracy: {forest1.score(X_test_withstop, y_test_withstop)*100:0.3f}%")
from xgboost import XGBClassifier
xg_classifier1 = XGBClassifier(random_state = 123)
xg_classifier1.fit(X_train_withstop, y_train_withstop)
print(f"Train Accuracy: {xg_classifier1.score(X_train_withstop, y_train_withstop)*100:0.3f}%")
print(f"Test Accuracy: {xg_classifier1.score(X_test_withstop, y_test_withstop)*100:0.3f}%")
from catboost import CatBoostClassifier
# NOTE(review): unlike the other models, this CatBoost has no random seed pinned.
boost1 = CatBoostClassifier(verbose=0)
boost1.fit(X_train_withstop, y_train_withstop)
print(f"Train Accuracy: {boost1.score(X_train_withstop, y_train_withstop)*100:0.3f}%")
print(f"Test Accuracy: {boost1.score(X_test_withstop, y_test_withstop)*100:0.3f}%")
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
print(stopwords.words('english'))
# Work on a copy so `data` keeps the with-stopwords text.
data1 = copy.deepcopy(data)
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
#only run one time, took 15 minutes
#for i in range(len(df)):
#w = data1.iloc[i, 1].split()
#resultwords = [word for word in w if word.lower() not in stop_words]
#data1.iloc[i, 1] = ' '.join(resultwords)
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): rebinding `vectorizer` discards the fit used for X_withstop.
vectorizer = TfidfVectorizer()
# TF-IDF over the bodies after clean() removes stop words and punctuation.
X = vectorizer.fit_transform(clean(data1)['Body'])
y = data1['Y'].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.2,
shuffle=True, random_state=123, stratify=y)
# NOTE(review): this "title" split reuses X and y, which were built from the
# Body column — the Title text is never vectorized, so every model evaluated
# on X_*_title is actually seeing Body features.
X_train_title, X_test_title, y_train_title, y_test_title = \
train_test_split(X, y, test_size=0.2,
shuffle=True, random_state=123, stratify=y)
# without stopwords
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours baseline on the TF-IDF features.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print(f"Train Accuracy: {knn.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {knn.score(X_test, y_test)*100:0.3f}%")
# without stopwords
from sklearn.ensemble import RandomForestClassifier
# Same hyperparameters as the with-stopwords forest, for a fair comparison.
forest = RandomForestClassifier(n_estimators=100,
random_state=123,
max_depth = 50)
forest.fit(X_train, y_train)
print(f"Train Accuracy: {forest.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {forest.score(X_test, y_test)*100:0.3f}%")
from xgboost import XGBClassifier
xg_classifier = XGBClassifier(random_state = 123)
xg_classifier.fit(X_train, y_train)
print(f"Train Accuracy: {xg_classifier.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {xg_classifier.score(X_test, y_test)*100:0.3f}%")
from catboost import CatBoostClassifier
boost = CatBoostClassifier(verbose=0, random_state = 123)
boost.fit(X_train, y_train)
print(f"Train Accuracy: {boost.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {boost.score(X_test, y_test)*100:0.3f}%")
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state = 123)
lr.fit(X_train, y_train)
print(f"Train Accuracy: {lr.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {lr.score(X_test, y_test)*100:0.3f}%")
# Run the various models on the titles
from catboost import CatBoostClassifier
# NOTE(review): rebinds `boost`, discarding the Body-trained CatBoost above.
boost = CatBoostClassifier(verbose=0)
boost.fit(X_train_title, y_train_title)
print(f"Train Accuracy: {boost.score(X_train_title, y_train_title)*100:0.3f}%")
print(f"Test Accuracy: {boost.score(X_test_title, y_test_title)*100:0.3f}%")
#title cannot be used for predicting.
# NOTE(review): X_*_title was produced from the Body TF-IDF matrix, so this run
# never saw title text — the conclusion above should be re-checked after the
# Title column is actually vectorized.
# Raw text lists for the LIME experiments (LIME needs strings, not sparse rows).
list_corpus = data["Body"].tolist()
list_labels = data["Y"].tolist()
# NOTE(review): unlike the earlier splits, this one is unstratified and uses a
# different seed (40), so results are not directly comparable.
X_train_lime, X_test_lime, y_train_lime, y_test_lime = train_test_split(list_corpus, list_labels, test_size=0.2, random_state=40)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Refit the shared vectorizer on this split's training text.
train_vectors = vectorizer.fit_transform(X_train_lime)
test_vectors = vectorizer.transform(X_test_lime)
# C=1e5 effectively disables regularisation.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(train_vectors, y_train_lime)
pred = logreg.predict(test_vectors)
accuracy = accuracy_score(y_test_lime, pred)
# Weighted averaging accounts for the class imbalance across the three labels.
precision = precision_score(y_test_lime, pred, average='weighted')
recall = recall_score(y_test_lime, pred, average='weighted')
f1 = f1_score(y_test_lime, pred, average='weighted')
print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

# Pipeline + explainer for the fitted logistic regression.
c = make_pipeline(vectorizer, logreg)
class_names=list(data['Y'].unique())
explainer = LimeTextExplainer(class_names=class_names)

# Pipeline + explainer for XGBoost, fitted on the same LIME split.
from xgboost import XGBClassifier
xg = XGBClassifier(random_state = 123)
xg.fit(train_vectors, y_train_lime)
d = make_pipeline(vectorizer, xg)
class_names=list(data['Y'].unique())
explainer_d = LimeTextExplainer(class_names=class_names)

from sklearn.ensemble import RandomForestClassifier
fo = RandomForestClassifier(n_estimators=100,
random_state=123,
max_depth = 50)
# BUG FIX: the original built this pipeline from `xg`, so `fo` was never
# fitted or used and the "random forest" explanations below were actually
# duplicate XGBoost explanations. Fit the forest and pipeline it instead.
fo.fit(train_vectors, y_train_lime)
e = make_pipeline(vectorizer, fo)
class_names=list(data['Y'].unique())
explainer_e = LimeTextExplainer(class_names=class_names)

# Show LIME explanations for the first three test documents per model.
for idx in range(3):
    exp = explainer_e.explain_instance(X_test_lime[idx], e.predict_proba, num_features=6)
    exp.show_in_notebook()
for idx in range(3):
    exp = explainer_d.explain_instance(X_test_lime[idx], d.predict_proba, num_features=6)
    exp.show_in_notebook()
#High_quality : 2
for idx in range(3):
    exp = explainer.explain_instance(X_test_lime[idx], c.predict_proba, num_features=6)
    exp.show_in_notebook()
from mlxtend.evaluate import mcnemar
from mlxtend.evaluate import mcnemar_table
# 2x2 contingency tables comparing XGBoost (model 1) vs logistic regression
# (model 2) on the train and test splits.
tb = mcnemar_table(y_target=y_train,
y_model1=xg_classifier.predict(X_train),
y_model2=lr.predict(X_train))
tb_test = mcnemar_table(y_target=y_test,
y_model1=xg_classifier.predict(X_test),
y_model2=lr.predict(X_test))
from mlxtend.plotting import checkerboard_plot
import matplotlib.pyplot as plt
brd = checkerboard_plot(tb,
figsize=(6, 6),
fmt='%d',
col_labels=['model 2 correct (train)', 'model 2 wrong (train)'],
row_labels=['model 1 correct (train)', 'model 1 wrong (train)'])
brd_test = checkerboard_plot(tb_test,
figsize=(6, 6),
fmt='%d',
col_labels=['model 2 correct (test)', 'model 2 wrong (test)'],
row_labels=['model 1 correct (test)', 'model 1 wrong (test)'])
plt.show()
# McNemar's test with continuity correction: is the disagreement between the
# two models statistically significant?
chi2, p = mcnemar(ary=tb, corrected=True)
print('chi-squared-for-train:', chi2)
print('p-value-for-train:', p)
chi2, p = mcnemar(ary=tb_test, corrected=True)
print('chi-squared-for-test:', chi2)
print('p-value-for-test:', p)
from mlxtend.evaluate import confusion_matrix
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_confusion_matrix
# NOTE(review): y has three classes (0/1/2) but binary=True collapses the
# matrix to "class 1 (LQ_EDIT) vs rest" — confirm that is intended.
cm = confusion_matrix(y_target=y_test,
y_predicted=xg_classifier.predict(X_test),
binary=True,
positive_label=1)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()
# Same binary confusion matrix for the logistic-regression model.
cm_lr= confusion_matrix(y_target=y_test,
y_predicted=lr.predict(X_test),
binary=True,
positive_label=1)
fig, ax = plot_confusion_matrix(conf_mat=cm_lr)
plt.show()